#coding:utf-8
from bs4 import BeautifulSoup
import os, sys
import string
import re
import time
import pymysql as mysql
import configparser

# Load the crawler configuration; ROOT is the directory the saved HTML lives under.
config = configparser.ConfigParser()
with open("config.ini", "r") as cfgfile:
	# read_file() replaces readfp(), which was deprecated and removed in Python 3.12.
	config.read_file(cfgfile)

ROOT = config.get("fileroot", "root")




def insert_sql(sqlList):
	"""Execute every statement in *sqlList* against the `wanli` DB in one transaction.

	Commits only after all statements succeed; the connection is always closed,
	even when an execute raises (the original leaked it on error).
	"""
	# NOTE(review): credentials are hard-coded here; consider moving them into
	# config.ini alongside the file root.
	conn = mysql.connect(host='192.168.30.182', port=3306, user='root',
	                     passwd='vipdatacenter', db='wanli', charset='utf8mb4')
	try:
		# pymysql cursors are context managers; this also closes the cursor.
		with conn.cursor() as cur:
			for sql in sqlList:
				cur.execute(sql)
		conn.commit()
	finally:
		conn.close()
def get_data(parent, filename):
	"""Parse one saved list page and insert every article's rawid into the DB.

	Reads the HTML file at ``parent/filename``, finds the ``DataList1`` span,
	and for each anchor extracts the id from its ``href`` query string
	(the text after the first ``=``), then batches the inserts via insert_sql().
	"""
	file_root = os.path.join(parent, filename)
	print(file_root)
	# `with` guarantees the handle is closed (the original never closed it).
	with open(file_root, 'r', encoding="utf-8") as f:
		htmlText = f.read()
	soup = BeautifulSoup(htmlText, 'lxml')
	div = soup.find('span', id='DataList1')
	sqlList = []
	for ta in div.find_all('a'):
		url = ta.get("href")
		# href is assumed to look like "...?id=<rawid>" — TODO confirm against the pages.
		rawid = url.split("=")[1]
		# rawid comes from scraped HTML (untrusted input): escape backslashes and
		# quotes so it cannot break out of the SQL string literal. Parameterized
		# queries would be better, but insert_sql() takes raw SQL strings.
		safe_id = rawid.replace("\\", "\\\\").replace("'", "\\'")
		sql = "insert ignore into article (rawid) values ('%s')" % safe_id
		sqlList.append(sql)
	insert_sql(sqlList)
if __name__ == "__main__":
	# Today's list pages live under <ROOT>/<YYYYMMDD>/list.
	# strftime() with no time argument already formats the current local time.
	html_root = os.path.join(ROOT, time.strftime('%Y%m%d'), "list")
	for parent, _dirnames, filenames in os.walk(html_root):
		for filename in filenames:
			# Only process saved HTML pages; skip anything else in the tree.
			if filename.endswith('.html'):
				get_data(parent, filename)